import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
%matplotlib notebook
df = pd.read_excel("./dataset.xlsx")
df.head()
df.describe()
# Inital exploration into target customers regarding age
fig = px.box(df, y="age")
fig.show()
# Analyzingoutliers in amount
df2 = df[df["amount"]<200]
df2.sort_values(by="amount",axis=0, ascending=False)
# Intial exploration into transaction salesin different states
# There were outliers in some amounts
df = df[df["amount"] < 200]
m_c = df["merchant_state"].dropna().unique()
bar_df = df.groupby("merchant_state").agg({"amount":np.sum})
fig= px.bar(bar_df, x=bar_df.index, y='amount', color=m_c, labels={"amount":"Total Sales in AUD $", "merchant_state":"Merchant States"})
fig.show()
# Explortions on sales for individual days
df_date = df.groupby("date").agg({"amount":np.sum}).reset_index()
fig = px.line(df_date, x="date", y="amount", labels={"amount":"Total Sales in AUD $"})
fig.show()
# Explortions on sales for individual months
# Convert dates to months
month_dict = {8:"August", 9:"September",10:"October"}
df["month"]= df["date"].dt.month
df["month"] = df["month"].apply(lambda x : month_dict.get(x))
df_month = df.groupby("month").agg({"amount":np.sum}).reset_index()
fig= px.histogram(df_month, x="month", y="amount", color ="month", template="plotly_dark",
labels={"amount": "Total Sales in AUD $"})
fig.show()
# Constructing a side by side comparisons with total sales
fig = make_subplots(
rows=1, cols=2,
column_widths=[0.6, 0.4],
row_heights=[0.4],
specs=[[{"type": "bar"}, {"type": "bar"}]]
)
fig.add_trace(
go.Bar(x=df_date["date"], y=df_date["amount"] ,showlegend=False),
row=1, col=1
)
fig.add_trace(
go.Bar(x=df_month["month"], y=df_month["amount"] ,showlegend=False,marker={"color" :"seagreen"}),
row=1, col=2
)
fig.update_layout(title_text="Side by Side Comaprisons of Total Sales: Days vs Months")
fig.show()
# Explortions on avergae sales for individual days
df_date_av = df.groupby("date").agg({"amount":np.mean}).reset_index()
fig = px.line(df_date_av, x="date", y="amount", labels={"amount":"Average Sales in AUD $"})
fig.show()
df_month_av = df.groupby("month").agg({"amount":np.mean}).reset_index()
fig= px.histogram(df_month_av, x="month", y="amount", color ="month", template="plotly_dark",
labels={"amount": "Average Sales in AUD $"})
fig.show()
# Constructing a side by side comparisons with total sales
fig = make_subplots(
rows=1, cols=2,
column_widths=[0.6, 0.4],
row_heights=[0.4],
specs=[[{"type": "bar"}, {"type": "bar"}]]
)
fig.add_trace(
go.Bar(x=df_date_av["date"], y=df_date_av["amount"] ,showlegend=False),
row=1, col=1
)
fig.add_trace(
go.Bar(x=df_month_av["month"], y=df_month_av["amount"] ,showlegend=False,marker={"color" :"seagreen"}),
row=1, col=2
)
fig.update_layout(title_text="Side by Side Comaprisons of Average Sales: Days vs Months")
fig.show()
# Needed to scale values to compare average and total sales per day.
scaler = MinMaxScaler()
# Scaled df_date
df_date_scaled = pd.DataFrame(scaler.fit_transform(df_date["amount"].values.reshape(-1, 1)), columns=["amount"])
# Scaled df_date_av
df_date_av_scaled = pd.DataFrame(scaler.fit_transform(df_date_av["amount"].values.reshape(-1, 1)), columns=["amount"])
# Constructing a side by side comparisons with total sales and average sales per day
# Utilize scaled valuesfor comparisons
fig = make_subplots(
specs=[[{"secondary_y": True}]]
)
fig.add_trace(
go.Bar(name="Total Sales",x=df_date["date"], y=df_date_scaled["amount"] ,marker={"color" :"green"}),
row=1, col=1
)
fig.add_trace(
go.Scatter(name="Average Sales", x=df_date_av["date"], y=df_date_av_scaled["amount"] ,marker={"color" :"darkslateblue"}),
row=1, col=1
)
fig.update_layout(title_text="Variaition of Average Sales and Total Sales each Day", yaxis_title="Scaled Values of Amount in AUD $")
fig.update_yaxes(showticklabels=False)
fig.show()
# Analysis of total sales by tax description
df_tax_d = df.groupby("txn_description").agg({"amount":np.sum})
fig = px.pie(df_tax_d, values='amount', names=df_tax_d.index, title='Sales by Tax Description',
color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
# Analyzing sales by total days
df_days=df.copy()
df_days["day"] = df["date"].dt.day
df_day_g = df_days.groupby("day").agg({"amount":np.sum})
fig= px.line(df_day_g, x=df_day_g.index, y="amount", labels={"amount": "Total Sales for a specific day in AUD $"})
fig.show()
# Analyzind data by laitude and longitude as a 3d visualization
df_lat_lon = pd.DataFrame(df["long_lat"].str.split(' ').tolist(),
columns = ['lon','lat'])
df_geol = df.copy()
df_geol["lon"] = df_lat_lon["lon"]
df_geol["lat"] = df_lat_lon["lat"]
df_geol=df_geol[["lat","lon","amount"]]
fig = go.Figure(data=[go.Surface(z=df_geol)])
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
highlightcolor="limegreen", project_z=True))
fig.show()
# Analyzind data by laitude and longitude
df_lat_lon = pd.DataFrame(df["long_lat"].str.split(' ').tolist(),
columns = ['lon','lat'])
df_geol = df.copy()
df_g_g = df_geol.groupby(["long_lat"]).agg({"amount":np.sum})
fig = px.bar(df_g_g, x=df_g_g.index, y="amount", color="amount")
fig.update_layout(title_text="Total Sales by coordinates")
fig.update_yaxes(title="Total Amount in AUD $")
fig.update_xaxes(title="Latitude and Longitude")
fig.show()
- The merchant NSW has the most sales.
- The key demographics tagerted is 20 to 40 years of age.
- The highest income of sales is generated when the averge sales are low for the day, meaning that discounts help sales.
- The most sales occur on the 17th day of the month.